## "Strategic Partisans: Electoral Motivations and Partisanship in Local Government Communication"
## Justin de Benedictis-Kessner
## jdbk@hks.harvard.edu


#### Preamble -----------------------------------------------------------------

# library(tm)
# library(RCurl)
# library(topicmodels)
# library(stm)
# library(tools)
library(xtable)
# library(RTextTools)
library(stargazer)
library(foreign)
library(lfe)
library(broom)
library(janitor)
library(quanteda)
library(quanteda.textmodels)
library(caret)
library(glmnet)
library(LiblineaR)
library(tidyverse)
library(tidytext)

## define colors:
red_bbc = "#BB312E"
blue_bbc = "#306B9F"


#### Read data -----------------------------------------------------------------

## Mayoral press releases
# press releases from replication data for de Benedictis-Kessner (2021, PSRM)
# available here: https://doi.org/10.7910/DVN/AJ7LEB
metainfo <- read_rds("metainfo.rds")
nrow(metainfo) # 111892 documents, w/ text in content column (already stripped of stopwords)
writeLines(prettyNum(nrow(metainfo),big.mark = ","),"tables/n_documents.tex",sep = "%")

# From dBK & Warshaw 2016 data, with recent updates:
# elecs <- read_csv("../../../../CA City Councils/data/elecs_mayors_long.csv")
# 
# elecs <- elecs %>%
#       filter(year >= 1985) # 4 years before the first PR min(metainfo2$PRdate)
# 
# elecs <- elecs %>%
#       mutate(month = str_replace(month,"\\D",""),
#              month = as.numeric(month))
# table(elecs$month)

## subset to those elections in PR data:
length(unique(metainfo$PRcity))
# cityinfo <- read_csv("../PRinfo_summ_gc.csv")
# 
# library(ggmap)
# gc <- geocode(paste0(cityinfo$PRcity,cityinfo$state),source="google")
# cityinfo$lon <- gc$lon
# cityinfo$lat <- gc$lat
# 
# write_csv(cityinfo,"PRinfo_summ_gc2.csv")

cityinfo <- read_csv("PRinfo_summ_gc2.csv")
# fix consolidated cities:
cityinfo$fips[cityinfo$PRcity=="Honolulu"] <- 1517000
cityinfo$fips[cityinfo$PRcity=="Indianapolis"] <- 1836000
cityinfo$fips[cityinfo$PRcity=="Louisville"] <- 2148000

metainfo$place_fips <- cityinfo$fips[match(metainfo$PRcity,cityinfo$PRcity)]
length(unique(metainfo$place_fips)) # 50
unique(metainfo[,c("PRcity","place_fips")])

# elecs <- elecs %>%
#       filter(place_fips %in% unique(metainfo$place_fips))
# dim(elecs) # down to 2224 cand-elections
# length(unique(elecs$place_fips)) # in 50 cities
# unique(metainfo$place_fips)[-which(unique(metainfo$place_fips) %in% elecs$place_fips)] # should be none
# unique(metainfo$PRcity[-which(metainfo$place_fips %in% elecs$place_fips)]) # consolidated cities fixed - should be none now

# elecs[elecs$lastname=="bloomberg","pid_final"] <- "Rep" # fix Bloomberg
# elecs$votes[elecs$place_fips==4827000 & elecs$lastname=="price" & elecs$year==2013] <- 1
# elecs$votes[elecs$place_fips==4827000 & elecs$lastname=="barr" & elecs$year==1999] <- 1
# elecs$winner[elecs$place_fips==4827000 & elecs$lastname=="barr" & elecs$year==1999] <- "win"
# elecs$winner[elecs$place_fips==4827000 & elecs$lastname=="na" & elecs$year==1999] <- "lose"

## create voteshare:
# elecs <- elecs %>%
#       filter(rvotes<=2) %>%
#       group_by(place_fips,office_consolidated,year,month) %>%
#       mutate(voteshare_toptwo = votes/sum(votes,na.rm=T))

# write_csv(elecs,"mayoral_elecs_prcities.csv")
elecs <- read_csv("mayoral_elecs_prcities.csv")
# fill mayoral party ID down to next 4 years
pid_by_year <- elecs %>%
      ungroup() %>%
      filter(winner == "win") %>%
      mutate(mayor_city = paste0(tolower(str_replace(city," ","")),"_",lastname,"_",year),
             mayor_firstnamelastname = paste(firstname,lastname)) %>%
      select(place_fips,year,mayor_city,mayor_firstnamelastname,pid_final,voteshare_toptwo)
dim(pid_by_year) # 538 elections
length(unique(pid_by_year$place_fips)) # in 50 cities

#### Descriptive plots: --------------------------------------------------------
## Temporal plot:
metainfo2 <- metainfo %>%
      filter(!is.na(PRdate))
metainfo2$plotorder <- ((max(as.numeric(factor(metainfo2$PRcity)))+1)-as.numeric(factor(metainfo2$PRcity)))

pid_by_elecs <- elecs %>%
      ungroup() %>%
      filter(winner == "win") %>%
      select(place_fips,year,month,pid_final) %>%
      mutate(elecdate = as.Date(paste(year,month,"01",sep="-")))

pid_by_elecs$plotorder <- metainfo2$plotorder[match(pid_by_elecs$place_fips,metainfo2$place_fips)]


pdf("figures/timeseries_zoom_bypid_horiz.pdf",width=12,height=8)
par(mar=c(3,8,0,3))
plot(NULL, xlim=c(as.Date("1998-12-31"),as.Date("2016-12-31")),
     ylim=c(min(metainfo2$plotorder),max(metainfo2$plotorder)),
     axes=F,xlab="Date",ylab="")
abline(h=c(min(metainfo2$plotorder)+0.5:max(metainfo2$plotorder)-0.5),
       col="lightgrey")
points(metainfo2$PRdate,
       jitter(metainfo2$plotorder),
       pch=".",cex=1)
# points(x=pid_by_elecs$elecdate[pid_by_elecs$elecdate<="2016-12-31"],
#        y=pid_by_elecs$plotorder[pid_by_elecs$elecdate<="2016-12-31"],
#        pch="l",cex=1,
#        col=ifelse(is.na(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]),
#                   "black",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Dem","blue",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Rep","red","black"))),
#        lty=ifelse(is.na(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]),
#                   "dashed",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Dem","solid",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Rep","dotted","blank"))))

segments(x0=pid_by_elecs$elecdate[pid_by_elecs$elecdate<="2016-12-31"],
         x1=pid_by_elecs$elecdate[pid_by_elecs$elecdate<="2016-12-31"],
         y0=pid_by_elecs$plotorder[pid_by_elecs$elecdate<="2016-12-31"]-0.5,
         y1=pid_by_elecs$plotorder[pid_by_elecs$elecdate<="2016-12-31"]+0.5,
         cex=1,
         lwd=1,
         col=ifelse(is.na(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]),
                    "black",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Dem","blue",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Rep","red","black"))),
         lty=ifelse(is.na(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]),
                    "dashed",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Dem","solid",ifelse(pid_by_elecs$pid_final[pid_by_elecs$elecdate<="2016-12-31"]=="Rep","dotted","blank"))))

axis(1,at=seq(as.Date("1998-12-31"),as.Date("2016-12-31"),by=365),
     labels=F,tick=T,line = -1)
text(x=seq(as.Date("1998-12-31"),as.Date("2016-12-31"),by=365),
     y=-0.7,
     cex = 0.7,
     labels=format(seq(as.Date("1998-12-31"),as.Date("2016-12-31"),by=365),"%b. '%y"),
     srt=45,pos=1,xpd=T)
axis(2,at=max(metainfo2$plotorder):1,
     labels=unique(factor(metainfo2$PRcity)),
     cex.axis=0.7,las=1)
axis(4,at=max(metainfo2$plotorder):1,
     labels=tapply(metainfo2$PRdate,metainfo2$PRcity,length),
     cex.axis=0.7,las=1)
axis(4,at=max(metainfo2$plotorder)+1.0,
     labels="n =",cex.axis=0.7,las=1,tick=F)
dev.off()



#### Maps:
library(sf)
library(tidycensus)


simpleCap <- function(x) {
      s <- strsplit(x, " ")[[1]]
      paste(toupper(substring(s, 1,1)), substring(s, 2),
            sep="", collapse=" ")
}

states <- get_acs(geography = "state",
                  variables = "B25077_001",
                  survey = "acs1",
                  geometry = TRUE,
                  shift_geo = TRUE)

# states <- states[-which(states$STUSPS %in% c("MP","VI","PR","GU","AS")),]

# shift AK/HI city points:
b <- data.frame(lon=cityinfo$lon,lat=cityinfo$lat,fips=cityinfo$fips,city=cityinfo$PRcity,state=cityinfo$state,plotname=paste(cityinfo$PRcity,cityinfo$state))
b <- b %>% filter(!is.na(fips))

b <- st_as_sf(b,coords=c("lon","lat"))
st_crs(b) <- 4326
b <- st_transform(b,st_crs(states))

# adjusting honolulu coordinates:
data(county_laea) # loads county data in with alaska/HI shifted
# counties(state="HI") # get fips code for Honolulu
hon_cent <- county_laea %>% 
      filter(GEOID==15003) %>% # Honolulu
      st_centroid() # centroid of county

b$geometry[which(b$state=="HI")] <- hon_cent$geometry # assign county centroid to city

# b <- data.frame(st_coordinates(b),plotname = b$plotname)

citymap <- ggplot(states) + 
      geom_sf(lwd=0.5,col="lightgrey",lty=1, border="white") + 
      geom_sf(data = b,size=0.5) + 
      ggrepel::geom_text_repel(data=b,aes(label=plotname, geometry=geometry),stat="sf_coordinates",size=2,min.segment.length = 0.15) + 
      theme_minimal() + 
      theme(axis.line = element_blank(),axis.text = element_blank(),
            panel.grid = element_blank(),axis.title = element_blank())

ggsave("figures/cities_map.pdf",citymap,height=4,width=6)


## Fill mayoral administration + party ID down for entire mayoral term:

pid_by_year <- pid_by_year %>%
      group_by(place_fips) %>%
      tidyr::complete(year = seq.int(1985, 2019,by = 1)) %>% # expand to have all years
      mutate(party_in_power = lag(pid_final,order_by = year,n = 1),
             mayor_in_power = lag(mayor_city,order_by = year,n = 1),
             mayor_firstnamelastname_in_power = lag(mayor_firstnamelastname,order_by = year,n = 1),
             voteshare_toptwo = lag(voteshare_toptwo,order_by = year,n = 1)) %>% # assuming party takes power year after election
      fill(place_fips,party_in_power,mayor_in_power,mayor_firstnamelastname_in_power,voteshare_toptwo,.direction = "down") # fill winner+PID down

metainfo <- metainfo %>%
      left_join(select(pid_by_year,-pid_final,-mayor_city,-mayor_firstnamelastname),by=c("place_fips" = "place_fips","PRyear" = "year"))

tabyl(metainfo,party_in_power) # 73% Dem, 27% Rep
tabyl(metainfo,mayor_in_power) # 7555 NA - from no-date PRs
summary(metainfo$voteshare_toptwo) # 0.5 to 1, mean=0.674

## need to replace mayor's name:
metainfo <- metainfo %>%
      mutate(mayor_lastname = str_replace(mayor_in_power,".*\\_(.*)\\_.*","\\1"))

metainfo <- metainfo %>%
      mutate(content2 = map2_chr(.x = content,.y = mayor_firstnamelastname_in_power,.f =  ~ str_replace_all(string = .x, pattern = .y,replacement = "mayorname")),
             content2 = map2_chr(.x = content2,.y = mayor_lastname,.f =  ~ str_replace_all(string = .x, pattern = .y,replacement = "mayorname")))


## turn into tdm with columns for mayoral admin, rows for vocab, cells with counts
content_by_mayor <- metainfo %>%
      # slice(1:30000) %>%
      group_by(PRcity,mayor_in_power,party_in_power,voteshare_toptwo) %>%
      summarize(content_grouped = str_c(content2,collapse = " "),
                n=n()) %>%
      rename(doc_id = mayor_in_power,
             text=content_grouped) %>%
      mutate(text = str_trim(text,"both")) %>%
      filter(n>=10) %>% # subset to mayoral admins with at least 10 PRs
      filter(!is.na(doc_id))
      

corpus_mayors <- quanteda::corpus(content_by_mayor)

dfm_mayors <- quanteda::dfm(corpus_mayors, stem=TRUE)
tidy_mayors <- tidy(dfm_mayors)
tidy_mayors$party <- quanteda::docvars(dfm_mayors,"party_in_power")[match(tidy_mayors$document,rownames(dfm_mayors))]
tdm_mayors <- tidytext::cast_tdm(data=tidy_mayors, document=document, term=term, value=count)
tdm_mayors_simple <- as.matrix(tdm_mayors)
tdm_mayors_tib <- as_tibble(tdm_mayors_simple)
tdm_mayors_tib$vocab <- rownames(tdm_mayors_simple)

tdm_mayors_mat <- as.matrix(select(tdm_mayors_tib,-vocab))
rownames(tdm_mayors_mat) <- tdm_mayors_tib$vocab
colnames(tdm_mayors_mat)

top_terms_byparty <- tidy_mayors %>%
      group_by(party,term) %>%
      summarize(count = sum(count)) %>%
      group_by(party) %>%
      top_n(10) %>%
      mutate(term = str_replace(term,"citynam","cityname"),
             term = str_replace(term,"mayornam","mayorname"),
             party=factor(party,levels=c("Dem","Rep"),labels=c("Democrat","Republican")))

(hist_topterms_byparty <- ggplot(top_terms_byparty,aes(fill=party)) + 
      geom_col(aes(x=tidytext::reorder_within(term,count,party),y=count),alpha=1) + 
      facet_wrap(~party,scales="free") + 
      tidytext::scale_x_reordered() +
      coord_flip() + 
      scale_y_continuous(expand = c(0, 0),labels=scales::number_format(big.mark=",")) +
      scale_fill_manual(breaks = c("Democrat","Republican"),values=c(blue_bbc,red_bbc)) + 
      labs(x = NULL, y = "Word count") + 
      theme_minimal() + 
      theme(legend.pos="none")
)
ggsave("figures/topterms_byparty.pdf",hist_topterms_byparty,height=4,width=8)




mayor_summ <- data.frame(mayor = rownames(dfm_mayors)
                         )
mayor_summ$PRcity <- content_by_mayor$PRcity[match(mayor_summ$mayor,content_by_mayor$doc_id)]
mayor_summ$party <- content_by_mayor$party_in_power[match(mayor_summ$mayor,content_by_mayor$doc_id)]
(n_cities <- length(unique(mayor_summ$PRcity))) # 48 cities
(n_mayoral_admins <- length(unique(mayor_summ$mayor))) # 141 mayoral administrations
writeLines(as.character(n_cities),"tables/n_cities.tex",sep="%")
writeLines(as.character(n_mayoral_admins),"tables/n_mayoral_admins.tex",sep="%")


## Bring in city ideology/partisanship from Tausanovitch and Warshaw (2014, APSR)
# available here: https://doi.org/10.7910/DVN/AXVEXM

TW_city_ideology <- read_csv("TW_cities_ideology.csv")
# 10 cities missing pres_2008:
TW_city_ideology$pres_2008[TW_city_ideology$city=="Boston"] <- 0.7894 # from city's website
TW_city_ideology$pres_2008[TW_city_ideology$city=="Indianapolis"] <- 0.637879 # from city website
TW_city_ideology$pres_2008[TW_city_ideology$city=="Louisville"] <- 0.5535 # from Jefferson County site
TW_city_ideology$pres_2008[TW_city_ideology$city=="Washington"] <- 0.9246


mayor_summ$place_fips <- pid_by_year$place_fips[match(mayor_summ$mayor,pid_by_year$mayor_city)]
mayor_summ <- left_join(mayor_summ, TW_city_ideology, by="place_fips")




#### SVM trained internally on mayoral PRs ####
# setup loop for k-fold classification
sims <- 1000
folds <- 5
ideal_mat_svm <- data.frame(mayor = rownames(dfm_mayors))
pred_mat_svm <- data.frame(mayor = rownames(dfm_mayors))
validation_mat_svm <- data.frame(mayor = rownames(dfm_mayors))
set.seed(02139)
start_kfold <- Sys.time()
for(i in 1:sims){
      # create folds:
      folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
      for(j in 1:length(folded)){
            quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
            # setup training/test data
            indices <- folded[[j]] # training data
            # downsampling Dems to get class balance in training data:
            n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
            n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
            dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
            rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
            indices_downsampled <- c(dem_ds_ind,rep_ind)
            # set non-training data to be NA
            docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
            this_out_svm <- LiblineaR(data = as.matrix(dfm_mayors[indices_downsampled,]), 
                                      target = docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                                      type=2)
            this_pred <- predict(this_out_svm,newx=as.matrix(dfm_mayors[-indices,]),decisionValues = T,probability = T)
            this_pred_prob <- this_pred$decisionValues[,1]
            
            this_ideal_df <- data.frame(rep(NA,nrow(dfm_mayors)))
            this_ideal_df[-indices,1] <- this_pred_prob
            names(this_ideal_df) <- paste0("ideal_",i,"_fold_",j)
            ideal_mat_svm <- bind_cols(ideal_mat_svm,this_ideal_df)
            
            this_pred_df <- data.frame(rep(NA,nrow(dfm_mayors)))
            this_pred_df[-indices,1] <- as.character(this_pred$predictions)
            names(this_pred_df) <- paste0("pred_",i,"_fold_",j)
            pred_mat_svm <- bind_cols(pred_mat_svm,this_pred_df)

            pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred_df)
            names(pred_correct) <- paste0("correct_",i,"_fold_",j)
            validation_mat_svm <- bind_cols(validation_mat_svm,pred_correct)
      }
}
end_kfold <- Sys.time()

end_kfold-start_kfold # 137 mins for 5-fold w/ 1000 sims


(accuracy_total <- apply(select(validation_mat_svm,correct_1_fold_1:last_col()),1,mean,na.rm=T) %>%
      mean(na.rm=T))
writeLines(as.character(round(100*accuracy_total,1)),"tables/svm_accuracy_total.tex",sep = "%")

## confusion matrix (from sim 1, folds 1:k)
confusion_mat <- data.frame(pred_dem = c(sum(pred_mat_svm[,2:6] == "Dem" & docvars(dfm_mayors,"party_in_power")=="Dem", na.rm=T), sum(pred_mat_svm[,2:6] == "Dem" & docvars(dfm_mayors,"party_in_power")=="Rep", na.rm=T)),
                            pred_rep = c(sum(pred_mat_svm[,2:6] == "Rep" & docvars(dfm_mayors,"party_in_power")=="Dem", na.rm=T), sum(pred_mat_svm[,2:6] == "Rep" & docvars(dfm_mayors,"party_in_power")=="Rep", na.rm=T))
)
rownames(confusion_mat) <- c("Actual Democrat","Actual Republican")
colnames(confusion_mat) <- c("Predicted Democrat","Predicted Republican")
print(xtable(confusion_mat),floating=F,file="tables/confusion_mat_svm.tex")


mayor_summ$svm_prob_dem <- apply(select(pred_mat_svm,pred_1_fold_1:last_col()),1,function(x) mean(x=="Dem",na.rm=T))
mayor_summ$svm_ideal <- apply(select(ideal_mat_svm,ideal_1_fold_1:last_col()),1,mean,na.rm=T)
mayor_summ$svm_prob_correct <- apply(select(validation_mat_svm,correct_1_fold_1:last_col()),1,mean,na.rm=T)


## ROC curve:
ideal_mat_svm_first <- data.frame(mayor = rownames(dfm_mayors))
pred_mat_svm_first <- data.frame(mayor = rownames(dfm_mayors))
validation_mat_svm_first <- data.frame(mayor = rownames(dfm_mayors))

# create folds:
folds <- 5
set.seed(02139)
folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
for(j in 1:length(folded)){
      quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
      # setup training/test data
      indices <- folded[[j]] # training data
      # downsampling Dems to get class balance in training data:
      n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
      n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
      dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
      rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
      indices_downsampled <- c(dem_ds_ind,rep_ind)
      # set non-training data to be NA
      docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
      this_out_svm <- LiblineaR(data = as.matrix(dfm_mayors[indices_downsampled,]), 
                                target = docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                                type=2)
      this_pred <- predict(this_out_svm,newx=as.matrix(dfm_mayors[-indices,]),decisionValues = T,probability = T)
      this_pred_prob <- this_pred$decisionValues[,1]
      
      this_ideal_df <- data.frame(rep(NA,nrow(dfm_mayors)))
      this_ideal_df[-indices,1] <- this_pred_prob
      names(this_ideal_df) <- paste0("ideal_fold_",j)
      ideal_mat_svm_first <- bind_cols(ideal_mat_svm_first,this_ideal_df)
      
      this_pred_df <- data.frame(rep(NA,nrow(dfm_mayors)))
      this_pred_df[-indices,1] <- as.character(this_pred$predictions)
      names(this_pred_df) <- paste0("pred_fold_",j)
      pred_mat_svm_first <- bind_cols(pred_mat_svm_first,this_pred_df)
      
      pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred_df)
      names(pred_correct) <- paste0("correct_fold_",j)
      validation_mat_svm_first <- bind_cols(validation_mat_svm_first,pred_correct)
}


ideal_mat_svm_first <- ideal_mat_svm_first %>%
      mutate(decision_value = coalesce(ideal_fold_1,ideal_fold_2,ideal_fold_3,ideal_fold_4,ideal_fold_5),
             true_label = docvars(dfm_mayors,"party_in_power")) %>% 
      select(mayor,true_label,decision_value)

roc_mat <- data.frame(threshold = sort(ideal_mat_svm_first$decision_value),
                      tpr = NA,
                      fpr = NA)
for(j in 1:length(roc_mat$threshold)){
      roc_mat$tpr[j] <- sum(ideal_mat_svm_first$decision_value>=roc_mat$threshold[j] & ideal_mat_svm_first$true_label=="Dem")/sum(ideal_mat_svm_first$true_label=="Dem")
      roc_mat$fpr[j] <- sum(ideal_mat_svm_first$decision_value>=roc_mat$threshold[j] & ideal_mat_svm_first$true_label!="Dem")/sum(ideal_mat_svm_first$true_label!="Dem")
}

roc_mat <- roc_mat %>%
      arrange(tpr)

(roc_curve_svm <- ggplot(roc_mat) + 
            geom_line(aes(x=fpr,y=tpr),col="blue") + 
            geom_abline(slope=1,intercept=0,lty=2) + 
            coord_cartesian(xlim=c(0,1),ylim=c(0,1)) + 
            xlab("False Positive Rate") + 
            ylab("True Positive Rate") + 
            ggtitle("ROC Curve, SVM") +
            theme_minimal()
)
ggsave(plot=roc_curve_svm,filename="figures/roc_curve_svm.pdf",height=5,width=6)

auc_mat <- pred_mat_svm_first %>%
      mutate(pred = coalesce(pred_fold_1,pred_fold_2,pred_fold_3,pred_fold_4,pred_fold_5),
             true_label = docvars(dfm_mayors,"party_in_power"),
             pred = recode(pred,"Dem"=1,"Rep"=0),
             true_label = recode(true_label,"Dem"=1,"Rep"=0)) %>% 
      select(mayor,true_label,pred)

roc_svm <- pROC::roc(auc_mat,response=true_label,predictor=pred)
(auc_svm <- pROC::auc(roc_svm))
writeLines(as.character(round(auc_svm,2)),con = "tables/svm_auc.tex",sep = "%")


## Most predictive words:
word_loadings <- this_out_svm$W %>%
      t() %>%
      as_tibble(rownames = "Word") %>%
      rename(Coefficient = V1) %>%
      arrange(desc(abs(Coefficient)))
# higher prob = more Dem, so positive loading = more predictive of Dem


word_loadings_dem <- word_loadings %>%
      filter(Coefficient>0) %>%
      arrange(desc(abs(Coefficient)))
print(xtable(word_loadings_dem[1:50,],digits = 4),row.names=F,floating=F,file="tables/word_loadings_svm_top50dem.tex")

word_loadings_rep <- word_loadings %>%
      filter(Coefficient<0) %>%
      arrange(desc(abs(Coefficient)))
print(xtable(word_loadings_rep[1:50,],digits = 4),row.names=F,floating=F,file="tables/word_loadings_svm_top50rep.tex")


word_loadings_sub <- word_loadings_rep %>%
      slice(1:10) %>%
      mutate(plotorder = 10:1,
             Coefficient = abs(Coefficient),
             party="Republican")

word_loadings_sub <- word_loadings_dem %>%
      slice(1:10) %>%
      mutate(plotorder = 10:1,
             Coefficient = abs(Coefficient),
             party="Democrat") %>%
      # mutate(Word = factor(Word,levels = plotorder,labels=Word,ordered=T)) %>%
      bind_rows(word_loadings_sub) %>%
      arrange(plotorder) %>%
      mutate(Word = factor(Word,levels=Word,ordered=T))
      

(topcoefs_both <- ggplot(word_loadings_sub) + 
            geom_bar(aes(x=Word,y=Coefficient,fill=party),stat="identity") + 
            scale_x_discrete("") + 
            coord_flip() + 
            facet_wrap(~ party,scales = "free") + 
            scale_fill_manual(values=c("Democrat" = blue_bbc,"Republican" = red_bbc)) + 
            theme_minimal() + 
            theme(legend.pos="none")
)
ggsave(topcoefs_both,filename = "figures/topcoefs_both_svm.pdf",height=4,width=8)


mayor_summ <- mayor_summ %>%
      mutate(plotorder = rank(svm_prob_correct,ties.method = "first"))

(prob_correct_ordered <- ggplot(mayor_summ) + 
            geom_point(aes(x=svm_prob_correct,y=plotorder,col=party,pch=party)) +
            scale_color_manual("Mayor PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_shape_discrete("Mayor PID") + 
            scale_x_continuous(paste0("SVM Prob. Correct, ",sims," sims")) + 
            scale_y_continuous("",breaks=mayor_summ$plotorder,labels=mayor_summ$mayor,minor_breaks = NULL) + 
            theme_minimal() + 
            theme(legend.pos=c(0.8,0.1),legend.background = element_rect(fill="white",linetype = 0),plot.margin = unit(c(-3,0,0,0),"lines"))
)
ggsave(prob_correct_ordered,filename = "figures/prob_correct_ordered.pdf",height=16,width=10)


(prob_correct_byTW <- ggplot(mayor_summ) + 
            geom_point(aes(y=svm_prob_correct,x=mrp_ideology,col=party,pch=party)) +
            geom_smooth(data=filter(mayor_summ,party=="Dem"),aes(y=svm_prob_correct,x=mrp_ideology),formula = y ~ x, method="lm",col=blue_bbc) + 
            geom_smooth(data=filter(mayor_summ,party=="Rep"),aes(y=svm_prob_correct,x=mrp_ideology),formula = y ~ x, method="lm",col=red_bbc,lty=2) + 
            # geom_smooth(aes(y=svm_prob_dem,x=mrp_ideology),formula = y ~ x, method="lm") + 
            scale_shape_discrete("Mayoral PID") + 
            scale_color_manual("Mayoral PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_y_continuous(paste0("SVM Prob. Correct")) + 
            scale_x_continuous("T&W Ideology") + 
            theme_minimal()
)
# ggsave(prob_correct_byTW,filename = "figures/prob_correct_byTW.pdf",height=4,width=5)

(prob_correct_bypres2008_noleg <- ggplot(mayor_summ) + 
            geom_point(aes(y=svm_prob_correct,x=pres_2008,col=party,pch=party)) +
            geom_smooth(data=filter(mayor_summ,party=="Dem"),aes(y=svm_prob_correct,x=pres_2008),formula = y ~ x, method="lm",col=blue_bbc) + 
            geom_smooth(data=filter(mayor_summ,party=="Rep"),aes(y=svm_prob_correct,x=pres_2008),formula = y ~ x, method="lm",col=red_bbc,lty=2) + 
            # geom_smooth(aes(y=svm_prob_dem,x=pres_2008),formula = y ~ x, method="lm") + 
            scale_shape_discrete("Mayoral PID") + 
            scale_color_manual("Mayoral PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_y_continuous(paste0("SVM Prob. Correct")) + 
            scale_x_continuous("2008 Democratic Presidential Voteshare") + 
            theme_minimal() + theme(legend.pos="none")
)
# ggsave(prob_correct_bypres2008_noleg,filename = "figures/prob_correct_bypres2008_noleg.pdf",height=4,width=5)

ggsave(gridExtra::arrangeGrob(prob_correct_bypres2008_noleg,prob_correct_byTW,widths=c(4,5),ncol=2,padding = 1),filename = "figures/prob_correct_bypres2008_TW.pdf",height=4,width=10)


## Ridge LR trained internally on mayoral PRs ----------------------------------------

# setup loop for k-fold classification
sims <- 1000
folds <- 5
validation_mat_lr <- data.frame(mayor = rownames(dfm_mayors))
pred_mat_lr <- data.frame(mayor = rownames(dfm_mayors))
ideal_mat_lr <- data.frame(mayor = rownames(dfm_mayors))
set.seed(02139)
start_kfold <- Sys.time()
for(i in 1:sims){
      # create folds:
      folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
      for(j in 1:length(folded)){
            quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
            # setup training/test data
            indices <- folded[[j]] # training data
            # downsampling Dems to get class balance in training data:
            n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
            n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
            dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
            rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
            indices_downsampled <- c(dem_ds_ind,rep_ind)
            # set non-training data to be NA
            docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
            this_out_lr <- LiblineaR(data = as.matrix(dfm_mayors[indices_downsampled,]), 
                                     target = docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                                     type=0)
            this_pred <- predict(this_out_lr,newx=as.matrix(dfm_mayors[-indices,]),proba=T,decisionValues = T)
            this_pred_prob <- this_pred$probabilities[,1]
            
            this_ideal_df <- data.frame(rep(NA,nrow(dfm_mayors)))
            this_ideal_df[-indices,1] <- this_pred_prob
            names(this_ideal_df) <- paste0("ideal_",i,"_fold_",j)
            ideal_mat_lr <- bind_cols(ideal_mat_lr,this_ideal_df)
            
            this_pred_df <- data.frame(rep(NA,nrow(dfm_mayors)))
            this_pred_df[-indices,1] <- as.character(this_pred$predictions)
            names(this_pred_df) <- paste0("pred_",i,"_fold_",j)
            pred_mat_lr <- bind_cols(pred_mat_lr,this_pred_df)
            
            pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred_df)
            names(pred_correct) <- paste0("correct_",i,"_fold_",j)
            validation_mat_lr <- bind_cols(validation_mat_lr,pred_correct)
      }
}
end_kfold <- Sys.time()

end_kfold-start_kfold # 64 mins for 5-fold w/ 1000 sims

apply(select(pred_mat_lr,pred_1_fold_1:last_col()),1,function(x) mean(x=="Dem",na.rm=T))


(accuracy_total_lr <- apply(select(validation_mat_lr,correct_1_fold_1:last_col()),1,mean,na.rm=T) %>%
            mean(na.rm=T)) # 70.1% correct
writeLines(as.character(round(100*accuracy_total_lr,1)),"tables/lr_accuracy_total.tex",sep = "%")


## confusion matrix (from sim 1, folds 1:k):
confusion_mat_lr <- data.frame(pred_dem = c(sum(pred_mat_lr[,2:6] == "Dem" & docvars(dfm_mayors,"party_in_power")=="Dem", na.rm=T), 
                                            sum(pred_mat_lr[,2:6] == "Dem" & docvars(dfm_mayors,"party_in_power")=="Rep", na.rm=T)),
                               pred_rep = c(sum(pred_mat_lr[,2:6] == "Rep" & docvars(dfm_mayors,"party_in_power")=="Dem", na.rm=T), 
                                            sum(pred_mat_lr[,2:6] == "Rep" & docvars(dfm_mayors,"party_in_power")=="Rep",na.rm=T))
)
rownames(confusion_mat_lr) <- c("Actual Democrat","Actual Republican")
colnames(confusion_mat_lr) <- c("Predicted Democrat","Predicted Republican")
print(xtable(confusion_mat_lr),floating=F,file="tables/confusion_mat_lr.tex")


mayor_summ$lr_prob_dem <- apply(select(pred_mat_lr,pred_1_fold_1:last_col()),1,function(x) mean(x=="Dem",na.rm=T))
mayor_summ$lr_ideal <- apply(select(ideal_mat_lr,ideal_1_fold_1:last_col()),1,mean,na.rm=T)
mayor_summ$lr_prob_correct <- apply(select(validation_mat_lr,correct_1_fold_1:last_col()),1,mean,na.rm=T)

## ROC Curve:
validation_mat_lr_first <- data.frame(mayor = rownames(dfm_mayors))
ideal_mat_lr_first <- data.frame(mayor = rownames(dfm_mayors))
pred_mat_lr_first <- data.frame(mayor = rownames(dfm_mayors))
set.seed(02139)
folds <- 5
folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
for(j in 1:length(folded)){
      quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
      # setup training/test data
      indices <- folded[[j]] # training data
      # downsampling Dems to get class balance in training data:
      n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
      n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
      dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
      rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
      indices_downsampled <- c(dem_ds_ind,rep_ind)
      # set non-training data to be NA
      docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
      this_out_lr <- LiblineaR(data = as.matrix(dfm_mayors[indices_downsampled,]), 
                               target = docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                               type=0)
      this_pred <- predict(this_out_lr,newx=as.matrix(dfm_mayors[-indices,]),proba=T,decisionValues = T)
      this_pred_prob <- this_pred$probabilities[,1]
      
      this_ideal_df <- data.frame(rep(NA,nrow(dfm_mayors)))
      this_ideal_df[-indices,1] <- this_pred_prob
      names(this_ideal_df) <- paste0("ideal_fold_",j)
      ideal_mat_lr_first <- bind_cols(ideal_mat_lr_first,this_ideal_df)
      
      this_pred_df <- data.frame(rep(NA,nrow(dfm_mayors)))
      this_pred_df[-indices,1] <- as.character(this_pred$predictions)
      names(this_pred_df) <- paste0("pred_fold_",j)
      pred_mat_lr_first <- bind_cols(pred_mat_lr_first,this_pred_df)
      
      pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred_df)
      names(pred_correct) <- paste0("correct_",i,"_fold_",j)
      validation_mat_lr_first <- bind_cols(validation_mat_lr_first,pred_correct)
}
ideal_mat_lr_first <- ideal_mat_lr_first %>%
      mutate(prob = coalesce(ideal_fold_1,ideal_fold_2,ideal_fold_3,ideal_fold_4,ideal_fold_5),
             true_label = docvars(dfm_mayors,"party_in_power")) %>% 
      select(mayor,true_label,prob)


      

roc_mat <- data.frame(threshold = sort(ideal_mat_lr_first$prob),
                      tpr = NA,
                      fpr = NA)
for(j in 1:length(roc_mat$threshold)){
      roc_mat$tpr[j] <- sum(ideal_mat_lr_first$prob>=roc_mat$threshold[j] & ideal_mat_lr_first$true_label=="Dem")/sum(ideal_mat_lr_first$true_label=="Dem")
      roc_mat$fpr[j] <- sum(ideal_mat_lr_first$prob>=roc_mat$threshold[j] & ideal_mat_lr_first$true_label!="Dem")/sum(ideal_mat_lr_first$true_label!="Dem")
}
roc_mat <- roc_mat %>%
      arrange(tpr)


auc_mat <- pred_mat_lr_first %>%
      mutate(pred = coalesce(pred_fold_1,pred_fold_2,pred_fold_3,pred_fold_4,pred_fold_5),
             true_label = docvars(dfm_mayors,"party_in_power"),
             pred = recode(pred,"Dem"=1,"Rep"=0),
             true_label = recode(true_label,"Dem"=1,"Rep"=0)) %>% 
      select(mayor,true_label,pred)

roc_lr <- pROC::roc(auc_mat,response=true_label,predictor=pred)
(auc_lr <- pROC::auc(roc_lr))
writeLines(as.character(round(auc_lr,2)),con = "tables/lr_auc.tex",sep = "%")






(prob_correct_lr_byTW <- ggplot(mayor_summ) + 
            geom_point(aes(y=lr_prob_correct,x=mrp_ideology,col=party,pch=party)) +
            geom_smooth(data=filter(mayor_summ,party=="Dem"),aes(y=lr_prob_correct,x=mrp_ideology),formula = y ~ x, method="lm",col=blue_bbc) + 
            geom_smooth(data=filter(mayor_summ,party=="Rep"),aes(y=lr_prob_correct,x=mrp_ideology),formula = y ~ x, method="lm",col=red_bbc,lty=2) + 
            # geom_smooth(aes(y=svm_prob_dem,x=mrp_ideology),formula = y ~ x, method="lm") + 
            scale_shape_discrete("Mayoral PID") + 
            scale_color_manual("Mayoral PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_y_continuous(paste0("Ridge Prob. Correct")) + 
            scale_x_continuous("T&W Ideology") + 
            theme_minimal()
)
ggsave(prob_correct_lr_byTW,filename = "figures/prob_correct_lr_byTW.pdf",height=4,width=5)



(prob_correct_lr_bypres2008 <- ggplot(mayor_summ) + 
            geom_point(aes(y=lr_prob_correct,x=pres_2008,col=party,pch=party)) +
            geom_smooth(data=filter(mayor_summ,party=="Dem"),aes(y=lr_prob_correct,x=pres_2008),formula = y ~ x, method="lm",col=blue_bbc) + 
            geom_smooth(data=filter(mayor_summ,party=="Rep"),aes(y=lr_prob_correct,x=pres_2008),formula = y ~ x, method="lm",col=red_bbc,lty=2) + 
            # geom_smooth(aes(y=svm_prob_dem,x=pres_2008),formula = y ~ x, method="lm") + 
            scale_shape_discrete("Mayoral PID") + 
            scale_color_manual("Mayoral PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_y_continuous(paste0("Ridge Prob. Correct")) + 
            scale_x_continuous("2008 Democratic Presidential Voteshare") + 
            theme_minimal()
)
ggsave(prob_correct_lr_bypres2008,filename = "figures/prob_correct_lr_bypres2008.pdf",height=4,width=5)




## Lasso regression ------------------------------------------------------------

## tune model params:
# setup loop for k-fold classification
folds <- 5
lambda_seq <- seq(0.2,0.001,by=-0.001)
this_pred_lambdas <- NULL
pred_mat_lasso <- data.frame(rep(NA,nrow(dfm_mayors)))
validation_mat_lasso <- data.frame(rep(NA,nrow(dfm_mayors)))
set.seed(02139)
start_kfold <- Sys.time()
# create folds:
folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
for(j in 1:length(folded)){
      quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
      # setup training/test data
      indices <- folded[[j]] # training data
      # downsampling Dems to get class balance in training data:
      n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
      n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
      dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
      rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
      indices_downsampled <- c(dem_ds_ind,rep_ind)
      # set non-training data to be NA
      docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
      this_out_lasso <- glmnet(x = as.matrix(dfm_mayors[indices_downsampled,]), 
                               y=docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                               alpha=1,family="binomial",
                               lambda = lambda_seq)
      this_pred <- predict(this_out_lasso,newx=as.matrix(dfm_mayors[-indices,]),type="class")

      pred_mat_lasso[-indices,1:200] <- this_pred
      
      this_pred_lambdas <- bind_cols(this_pred_lambdas,this_out_lasso$lambda)
      
      pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred)
      validation_mat_lasso[-indices,1:200] <- pred_correct
}

end_kfold <- Sys.time()
end_kfold - start_kfold # 17 seconds

## find best lambda that minimizes error:
dim(validation_mat_lasso) # 141 mayors x 200 lambda values
# avg accuracy by lambda:
avg_acc <- apply(validation_mat_lasso,2,mean)
summary(avg_acc) # best is 72% accuracy
this_pred_lambdas[which.max(avg_acc),] # 0.166
lambda.min <- lambda_seq[which.max(avg_acc)]

sims <- 1000
folds <- 5
ideal_mat_lasso <- data.frame(mayor = rownames(dfm_mayors))
pred_mat_lasso <- data.frame(mayor = rownames(dfm_mayors))
validation_mat_lasso <- data.frame(mayor = rownames(dfm_mayors))
set.seed(02139)
start_kfold <- Sys.time()
for(i in 1:sims){
      # create folds:
      folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
      for(j in 1:length(folded)){
            quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
            # setup training/test data
            indices <- folded[[j]] # training data
            # downsampling Dems to get class balance in training data:
            n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
            n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
            dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
            rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
            indices_downsampled <- c(dem_ds_ind,rep_ind)
            # set non-training data to be NA
            docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
            this_out_lasso <- glmnet(x = as.matrix(dfm_mayors[indices_downsampled,]), 
                                     y=docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                                     alpha=1,family="binomial")
            this_pred <- predict(this_out_lasso,newx=as.matrix(dfm_mayors[-indices,]),type="class",s=lambda.min)
            this_pred_prob <- predict(this_out_lasso,newx=as.matrix(dfm_mayors[-indices,]),type="response",s = lambda.min)
            # this_pred_prob <- predict(this_out_svm,type="prob")
            
            this_pred_df <- data.frame(rep(NA,nrow(dfm_mayors)))
            this_pred_df[-indices,1] <- as.character(this_pred)
            names(this_pred_df) <- paste0("pred_",i,"_fold_",j)
            pred_mat_lasso <- bind_cols(pred_mat_lasso,this_pred_df)
            
            this_ideal_df <- data.frame(rep(NA,nrow(dfm_mayors)))
            this_ideal_df[-indices,1] <- this_pred_prob
            names(this_ideal_df) <- paste0("ideal_",i,"_fold_",j)
            ideal_mat_lasso <- bind_cols(ideal_mat_lasso,this_ideal_df)
            
            pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred_df)
            names(pred_correct) <- paste0("correct_",i,"_fold_",j)
            validation_mat_lasso <- bind_cols(validation_mat_lasso,pred_correct)
      }
}
end_kfold <- Sys.time()
end_kfold-start_kfold # 2.83 hours for 5-fold w/ 1000 sims

apply(select(pred_mat_lasso,pred_1_fold_1:last_col()),1,function(x) mean(x=="Dem",na.rm=T))


(accuracy_total_lasso <- apply(select(validation_mat_lasso,correct_1_fold_1:last_col()),1,mean,na.rm=T) %>%
            mean(na.rm=T)) # 64.8% correct
writeLines(as.character(round(100*accuracy_total_lasso,1)),"tables/lasso_accuracy_total.tex",sep = "%")


## confusion matrix (from sim 1, folds 1:k):
confusion_mat_lasso <- data.frame(pred_dem = c(sum(pred_mat_lasso[,2:6] == "Dem" & docvars(dfm_mayors,"party_in_power")=="Dem", na.rm=T), 
                                               sum(pred_mat_lasso[,2:6] == "Dem" & docvars(dfm_mayors,"party_in_power")=="Rep", na.rm=T)),
                                  pred_rep = c(sum(pred_mat_lasso[,2:6] == "Rep" & docvars(dfm_mayors,"party_in_power")=="Dem", na.rm=T), 
                                               sum(pred_mat_lasso[,2:6] == "Rep" & docvars(dfm_mayors,"party_in_power")=="Rep",na.rm=T))
)
rownames(confusion_mat_lasso) <- c("Actual Democrat","Actual Republican")
colnames(confusion_mat_lasso) <- c("Predicted Democrat","Predicted Republican")
print(xtable(confusion_mat_lasso),floating=F,file="tables/confusion_mat_lasso.tex")


mayor_summ$lasso_ideal <- apply(select(ideal_mat_lasso,ideal_1_fold_1:last_col()),1,mean,na.rm=T)
mayor_summ$lasso_prob_dem <- apply(select(pred_mat_lasso,pred_1_fold_1:last_col()),1,function(x) mean(x=="Dem",na.rm=T))
mayor_summ$lasso_prob_correct <- apply(select(validation_mat_lasso,correct_1_fold_1:last_col()),1,mean,na.rm=T)

## ROC Curve:
validation_mat_lasso_first <- data.frame(mayor = rownames(dfm_mayors))
ideal_mat_lasso_first <- data.frame(mayor = rownames(dfm_mayors))
pred_mat_lasso_first <- data.frame(mayor = rownames(dfm_mayors))
set.seed(02139)
folds <- 5
folded <- caret::createFolds(1:nrow(dfm_mayors),k = folds,returnTrain=T)
for(j in 1:length(folded)){
      quanteda::docvars(dfm_mayors,"partylab_this") <- docvars(dfm_mayors,"party_in_power")
      # setup training/test data
      indices <- folded[[j]] # training data
      # downsampling Dems to get class balance in training data:
      n_rep <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Rep")
      n_dem <- sum(docvars(dfm_mayors,"party_in_power")[indices] == "Dem")
      dem_ds_ind <- sample(indices[docvars(dfm_mayors,"party_in_power")[indices] == "Dem"],n_rep,replace = F)
      rep_ind <- indices[docvars(dfm_mayors,"party_in_power")[indices] == "Rep"]
      indices_downsampled <- c(dem_ds_ind,rep_ind)
      # set non-training data to be NA
      docvars(dfm_mayors,"partylab_this")[-indices_downsampled] <- NA
      this_out_lasso <- glmnet(x = as.matrix(dfm_mayors[indices_downsampled,]), 
                               y=docvars(dfm_mayors,"partylab_this")[indices_downsampled],
                               alpha=1,family="binomial")
      this_pred <- predict(this_out_lasso,newx=as.matrix(dfm_mayors[-indices,]),type="class",s=lambda.min)
      this_pred_prob <- predict(this_out_lasso,newx=as.matrix(dfm_mayors[-indices,]),type="response",s = lambda.min)
      # this_pred_prob <- predict(this_out_svm,type="prob")
      
      this_pred_df <- data.frame(rep(NA,nrow(dfm_mayors)))
      this_pred_df[-indices,1] <- as.character(this_pred)
      names(this_pred_df) <- paste0("pred_fold_",j)
      pred_mat_lasso_first <- bind_cols(pred_mat_lasso_first,this_pred_df)
      
      this_ideal_df <- data.frame(rep(NA,nrow(dfm_mayors)))
      this_ideal_df[-indices,1] <- this_pred_prob
      names(this_ideal_df) <- paste0("ideal_fold_",j)
      ideal_mat_lasso_first <- bind_cols(ideal_mat_lasso_first,this_ideal_df)
      
      pred_correct <- data.frame(docvars(dfm_mayors,"party_in_power") == this_pred_df)
      names(pred_correct) <- paste0("correct_fold_",j)
      validation_mat_lasso_first <- bind_cols(validation_mat_lasso_first,pred_correct)
}
ideal_mat_lasso_first <- ideal_mat_lasso_first %>%
      mutate(prob = coalesce(ideal_fold_1,ideal_fold_2,ideal_fold_3,ideal_fold_4,ideal_fold_5),
             true_label = docvars(dfm_mayors,"party_in_power")) %>% 
      select(mayor,true_label,prob)

roc_mat <- data.frame(threshold = sort(ideal_mat_lasso_first$prob),
                      tpr = NA,
                      fpr = NA)
for(j in 1:length(roc_mat$threshold)){
      roc_mat$tpr[j] <- sum(ideal_mat_lasso_first$prob>=roc_mat$threshold[j] & ideal_mat_lasso_first$true_label=="Rep")/sum(ideal_mat_lasso_first$true_label=="Rep")
      roc_mat$fpr[j] <- sum(ideal_mat_lasso_first$prob>=roc_mat$threshold[j] & ideal_mat_lasso_first$true_label!="Rep")/sum(ideal_mat_lasso_first$true_label!="Rep")
}

roc_mat <- roc_mat %>%
      arrange(tpr)


auc_mat <- pred_mat_lasso_first %>%
      mutate(pred = coalesce(pred_fold_1,pred_fold_2,pred_fold_3,pred_fold_4,pred_fold_5),
             true_label = docvars(dfm_mayors,"party_in_power"),
             pred = recode(pred,"Dem"=1,"Rep"=0),
             true_label = recode(true_label,"Dem"=1,"Rep"=0)) %>% 
      select(mayor,true_label,pred)

roc_lasso <- pROC::roc(auc_mat,response=true_label,predictor=pred)
(auc_lasso <- pROC::auc(roc_lasso)) # 0.6114
writeLines(as.character(round(auc_lasso,2)),con = "tables/lasso_auc.tex",sep = "%")



(prob_correct_lasso_byTW <- ggplot(mayor_summ) + 
            geom_point(aes(y=lasso_prob_correct,x=mrp_ideology,col=party,pch=party)) +
            geom_smooth(data=filter(mayor_summ,party=="Dem"),aes(y=lasso_prob_correct,x=mrp_ideology),formula = y ~ x, method="lm",col=blue_bbc) + 
            geom_smooth(data=filter(mayor_summ,party=="Rep"),aes(y=lasso_prob_correct,x=mrp_ideology),formula = y ~ x, method="lm",col=red_bbc,lty=2) + 
            # geom_smooth(aes(y=svm_prob_dem,x=mrp_ideology),formula = y ~ x, method="lm") + 
            scale_shape_discrete("Mayoral PID") + 
            scale_color_manual("Mayoral PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_y_continuous(paste0("Lasso Prob. Correct")) + 
            scale_x_continuous("T&W Ideology") + 
            theme_minimal()
)
ggsave(prob_correct_lasso_byTW,filename = "figures/prob_correct_lasso_byTW.pdf",height=4,width=5)

(prob_correct_lasso_bypres2008 <- ggplot(mayor_summ) + 
            geom_point(aes(y=lasso_prob_correct,x=pres_2008,col=party,pch=party)) +
            geom_smooth(data=filter(mayor_summ,party=="Dem"),aes(y=lasso_prob_correct,x=pres_2008),formula = y ~ x, method="lm",col=blue_bbc) + 
            geom_smooth(data=filter(mayor_summ,party=="Rep"),aes(y=lasso_prob_correct,x=pres_2008),formula = y ~ x, method="lm",col=red_bbc,lty=2) + 
            # geom_smooth(aes(y=svm_prob_dem,x=pres_2008),formula = y ~ x, method="lm") + 
            scale_shape_discrete("Mayoral PID") + 
            scale_color_manual("Mayoral PID",breaks = c("Dem","Rep"),values=c(blue_bbc,red_bbc)) + 
            scale_y_continuous(paste0("Lasso Prob. Correct")) + 
            scale_x_continuous("2008 Democratic Presidential Voteshare") + 
            theme_minimal()
)
ggsave(prob_correct_lasso_bypres2008,filename = "figures/prob_correct_lasso_bypres2008.pdf",height=4,width=5)





## Local finances data: --------------------------------------------------------

IndFin <- read_rds("IndFin_prcities.rds")
IndFin <- IndFin %>%
      group_by(place_fips) %>%
      mutate(expenditure.PC_ln = log1p(expenditure.PC),
             expenditure.PC_ln_lead1 = lead.new(expenditure.PC_ln,n=1,along_with = Year4),
             expenditure.PC_ln_lead2 = lead.new(expenditure.PC_ln,n=2,along_with = Year4),
             expenditure.PC_ln_lead3 = lead.new(expenditure.PC_ln,n=3,along_with = Year4),
             expenditure.PC_ln_lead4 = lead.new(expenditure.PC_ln,n=4,along_with = Year4),
             expenditure.PC_ln_delta1 = expenditure.PC_ln_lead1 - expenditure.PC_ln,
             expenditure.PC_ln_delta2 = expenditure.PC_ln_lead2 - expenditure.PC_ln,
             expenditure.PC_ln_delta3 = expenditure.PC_ln_lead3 - expenditure.PC_ln,
             expenditure.PC_ln_delta4 = expenditure.PC_ln_lead4 - expenditure.PC_ln,
             )

IndFin_formerge <- IndFin %>%
      select(place_fips,Year4,contains("expenditure.PC"))
IndFin_formerge <- IndFin_formerge %>%
      rowwise() %>%
      mutate(expenditure.PC_ln_lead4avg = mean(c(expenditure.PC_ln_lead1,expenditure.PC_ln_lead2,expenditure.PC_ln_lead3,expenditure.PC_ln_lead4),na.rm=T),
             expenditure.PC_ln_delta4avg = mean(c(expenditure.PC_ln_delta1,expenditure.PC_ln_delta2,expenditure.PC_ln_delta3,expenditure.PC_ln_delta4),na.rm=T))


mayor_summ <- mayor_summ %>%
      mutate(year_term_starts = as.numeric(str_replace(mayor,".*(\\d{4})$","\\1")))
mayor_summ <- left_join(mayor_summ,IndFin_formerge,by=c("place_fips" = "place_fips",
                                                        "year_term_starts" = "Year4"))




## Tables: ---------------------------------------------------------------------

fit_correct_pres2008_bypid_cl <- lfe::felm(svm_prob_correct ~ pres_2008*party | 0 | 0 | PRcity, data=mayor_summ)
fit_correct_TW_bypid_cl <- lfe::felm(svm_prob_correct ~ mrp_ideology*party | 0 | 0 | PRcity, data=mayor_summ)

fit_correct_pres2008_exp_bypid_cl <- lfe::felm(svm_prob_correct ~ pres_2008*party + expenditure.PC_ln_lead4avg | 0 | 0 | PRcity, data=mayor_summ)
fit_correct_TW_exp_bypid_cl <- lfe::felm(svm_prob_correct ~ mrp_ideology*party + expenditure.PC_ln_lead4avg | 0 | 0 | PRcity, data=mayor_summ)

stargazer(fit_correct_pres2008_bypid_cl,fit_correct_TW_bypid_cl,fit_correct_pres2008_exp_bypid_cl,fit_correct_TW_exp_bypid_cl,
          dep.var.labels = "SVM Prob. Correct",
          float = F,
          omit.stat = "ser",
          covariate.labels = c("2008 Democratic Presidential voteshare",
                               "T\\&W Ideology",
                               "Republican Mayor",
                               "Avg. Logged Expenditures PC",
                               "2008 Democratic Presidential voteshare $\\times$ Rep. Mayor",
                               "T\\&W Ideology $\\times$ Rep. Mayor"),
          out = "tables/intx_models_wExp2_cl.tex",
          notes="Standard errors clustered by city.")



fit_correct_lr_pres2008_bypid_cl <- lfe::felm(lr_prob_correct ~ pres_2008*party | 0 | 0 | PRcity, data=mayor_summ)
fit_correct_lr_TW_bypid_cl <- lfe::felm(lr_prob_correct ~ mrp_ideology*party | 0 | 0 | PRcity, data=mayor_summ)

fit_correct_lr_pres2008_exp_bypid_cl <- lfe::felm(lr_prob_correct ~ pres_2008*party + expenditure.PC_ln_lead4avg | 0 | 0 | PRcity, data=mayor_summ)
fit_correct_lr_TW_exp_bypid_cl <- lfe::felm(lr_prob_correct ~ mrp_ideology*party + expenditure.PC_ln_lead4avg | 0 | 0 | PRcity, data=mayor_summ)


stargazer(fit_correct_lr_pres2008_bypid_cl,fit_correct_lr_TW_bypid_cl,fit_correct_lr_pres2008_exp_bypid_cl,fit_correct_lr_TW_exp_bypid_cl,
          dep.var.labels = "Ridge Prob. Correct",
          float = F,
          omit.stat = "ser",
          covariate.labels = c("2008 Democratic Presidential voteshare",
                               "T\\&W Ideology",
                               "Republican Mayor",
                               "Avg. Logged Expenditures PC",
                               "2008 Democratic Presidential voteshare $\\times$ Rep. Mayor",
                               "T\\&W Ideology $\\times$ Rep. Mayor"),
          out = "tables/intx_models_wExp2_cl_lr.tex",
          notes="Standard errors clustered by city.")


fit_correct_lasso_pres2008_bypid_cl <- lfe::felm(lasso_prob_correct ~ pres_2008*party | 0 | 0 | PRcity, data=mayor_summ)
fit_correct_lasso_TW_bypid_cl <- lfe::felm(lasso_prob_correct ~ mrp_ideology*party | 0 | 0 | PRcity, data=mayor_summ)

fit_correct_lasso_pres2008_exp_bypid_cl <- lfe::felm(lasso_prob_correct ~ pres_2008*party + expenditure.PC_ln_lead4avg | 0 | 0 | PRcity, data=mayor_summ)
fit_correct_lasso_TW_exp_bypid_cl <- lfe::felm(lasso_prob_correct ~ mrp_ideology*party + expenditure.PC_ln_lead4avg | 0 | 0 | PRcity, data=mayor_summ)


stargazer(fit_correct_lasso_pres2008_bypid_cl,fit_correct_lasso_TW_bypid_cl,fit_correct_lasso_pres2008_exp_bypid_cl,fit_correct_lasso_TW_exp_bypid_cl,
          dep.var.labels = "Lasso Prob. Correct",
          float = F,
          omit.stat = "ser",
          covariate.labels = c("2008 Democratic Presidential voteshare",
                               "T\\&W Ideology",
                               "Republican Mayor",
                               "Avg. Logged Expenditures PC",
                               "2008 Democratic Presidential voteshare $\\times$ Rep. Mayor",
                               "T\\&W Ideology $\\times$ Rep. Mayor"),
          out = "tables/intx_models_wExp2_cl_lasso.tex",
          notes="Standard errors clustered by city.")

